Source code for hysop.backend.device.opencl.opencl_copy_kernel_launchers

# Copyright (c) HySoP 2011-2024
#
# This file is part of HySoP software.
# See "https://particle_methods.gricad-pages.univ-grenoble-alpes.fr/hysop-doc/"
# for further info.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import numpy as np

from hysop import vprint, dprint, __KERNEL_DEBUG__, __TRACE_KERNELS__
from hysop.constants import Backend
from hysop.tools.profiler import FProfiler
from hysop.tools.decorators import debug
from hysop.tools.htypes import check_instance, first_not_None, to_list
from hysop.tools.misc import prod
from hysop.tools.units import bytes2str
from hysop.tools.numpywrappers import npw
from hysop.core.arrays.all import Array, HostArray, OpenClArray
from hysop.backend.device.opencl import cl, clArray
from hysop.backend.device.opencl.opencl_kernel_launcher import (
    OpenClKernelLauncher,
    trace_kernel,
    profile_kernel,
)
from hysop.backend.device.opencl.opencl_kernel_statistics import OpenClKernelStatistics


[docs] class OpenClCopyKernelLauncher(OpenClKernelLauncher): """Interface to non-blocking OpenCL copy kernels.""" @debug def __init__(self, name, dst, src, enqueue_copy_kwds, apply_msg, **kwds): """ Initialize an OpenClCopyKernelLauncher. Parameters ---------- enqueue_copy_kwds: dict Arguments to to passed to pyopencl.enqueue_copy. """ assert "default_global_work_size" not in kwds assert "default_local_work_size" not in kwds assert "is_blocking" not in kwds enqueue_copy_kwds["dest"] = dst enqueue_copy_kwds["src"] = src if isinstance(src, np.ndarray) or isinstance(dst, np.ndarray): enqueue_copy_kwds["is_blocking"] = False super().__init__(name=name, kernel=None, args_list=(), **kwds) self._enqueue_copy_kwds = enqueue_copy_kwds self._apply_msg = apply_msg def _get_enqueue_copy_kwds(self): """ Return a copy of the keywords arguments that will be passed to pyopencl.enqueue_copy. """ return dict(self._enqueue_copy_kwds.items()) def __call__(self, queue=None, wait_for=None, **kwds): trace_kernel(" " + self._apply_msg) queue = first_not_None(queue, self._default_queue) if wait_for is not None: wait_for = to_list(wait_for) check_instance(queue, cl.CommandQueue) evt = cl.enqueue_copy(queue=queue, wait_for=wait_for, **self._enqueue_copy_kwds) profile_kernel(None, evt, self._apply_msg, fprofiler=self._profiler) return evt
[docs] def global_size_configured(self): return True
enqueue_copy_kwds = property(_get_enqueue_copy_kwds)
[docs] class OpenClCopyBufferLauncher(OpenClCopyKernelLauncher): """Non-blocking OpenCL copy kernel between host buffers and/or opencl device buffers.""" def __init__( self, varname, src, dst, src_device_offset=None, dst_device_offset=None, byte_count=None, **kwds, ): """ Initialize a (HOST <-> DEVICE) or a (DEVICE <-> DEVICE) copy kernel. Parameters ---------- varname: str Name of the variable copied for loggin purposes. src: cl.MemoryObjectHolder or np.ndarray The source buffer. dst: cl.MemoryObjectHolder or np.ndarray The destination buffer. src_device_offset: int, optional Offset in the source buffer, only valid if source buffer is a cl.MemoryObjectHolder. dst_device_offset: int, optional Offset in the source buffer, only valid if source buffer is a cl.MemoryObjectHolder. byte_count: int Byte count to copy if and only if source and destination buffers are cl.MemoryObjectHolders. Notes ----- The size of the transfer is controlled by the size of the of the host-side buffer. If the host-side buffer is a numpy.ndarray, you can control the transfer size by transfering into a smaller view of the target array by using indexing, If neither src nor dst are host buffers, the size is controlled by the parameter byte_count. Device buffers cannot have views like np.ndarrays, an offset in bytes can be given as src_device_offset or dst_device_offset instead. """ check_instance(src, (cl.MemoryObjectHolder, np.ndarray)) check_instance(dst, (cl.MemoryObjectHolder, np.ndarray)) check_instance(src_device_offset, (int, np.integer), allow_none=True) check_instance(dst_device_offset, (int, np.integer), allow_none=True) check_instance(byte_count, (int, np.integer), allow_none=True) msg = "Host to host copy is not supported." assert not (isinstance(src, np.ndarray) and isinstance(dst, np.ndarray)), msg enqueue_copy_kwds = {} if src_device_offset is not None: assert isinstance(src, cl.MemoryObjectHolder) enqueue_copy_kwds["src_offset"] = src_device_offset if dst_device_offset is not None: assert isinstance(dst, cl.MemoryObjectHolder) enqueue_copy_kwds["dst_offset"] = dst_device_offset if byte_count is not None: assert isinstance(src, cl.MemoryObjectHolder) assert isinstance(dst, cl.MemoryObjectHolder) enqueue_copy_kwds["byte_count"] = byte_count shape = first_not_None( (byte_count,), getattr(src, "shape", None), getattr(dst, "shape", None), "...", ) assert "name" not in kwds name = "enqueue_copy_{}__{}_to_{}".format( varname, "host" if isinstance(src, np.ndarray) else "device", "host" if isinstance(dst, np.ndarray) else "device", ) apply_msg = f"{name}<<<{shape}>>>" super().__init__( dst=dst, src=src, enqueue_copy_kwds=enqueue_copy_kwds, name=name, apply_msg=apply_msg, **kwds, ) def _format_host_arg(self, arg): if isinstance(arg, HostArray): arg = arg.data nbytes = arg.nbytes elif isinstance(arg, np.ndarray): nbytes = arg.size * arg.dtype.itemsize else: msg = "Unknown type {} to format device buffer arguments." msg = msg.format(type(arg)) raise TypeError(msg) return arg, nbytes def _format_device_arg(self, arg, arg_offset): nbytes = None if isinstance(arg, (OpenClArray, clArray.Array)): arg_offset = first_not_None(arg_offset, 0) nbytes = arg.nbytes - arg_offset arg_offset += arg.offset arg = arg.base_data elif isinstance(arg, cl.MemoryObjectHolder): pass else: msg = "Unknown type {} to format device buffer arguments." msg = msg.format(type(arg)) raise TypeError(msg) return (arg, arg_offset, nbytes)
[docs] class OpenClCopyHost2DeviceLauncher(OpenClCopyBufferLauncher): """Reduced interface for host to device copy kernels.""" def __init__(self, varname, src, dst, dst_device_offset=None): src, src_nbytes = self._format_host_arg(src) dst, dst_device_offset, dst_nbytes = self._format_device_arg( dst, dst_device_offset ) check_instance(src, (np.ndarray,)) check_instance(dst, (cl.MemoryObjectHolder,)) check_instance(dst_device_offset, (int, np.integer), allow_none=True) assert ( (src_nbytes is None) or (dst_nbytes is None) or (src_nbytes == dst_nbytes) ) super().__init__( varname=varname, src=src, dst=dst, dst_device_offset=dst_device_offset )
[docs] class OpenClCopyDevice2HostLauncher(OpenClCopyBufferLauncher): """Reduced interface for device to host copy kernels.""" def __init__(self, varname, src, dst, src_device_offset=None): src, src_device_offset, src_nbytes = self._format_device_arg( src, src_device_offset ) dst, dst_nbytes = self._format_host_arg(dst) check_instance(src, (cl.MemoryObjectHolder,)) check_instance(dst, (np.ndarray,)) check_instance(src_device_offset, (int, np.integer), allow_none=True) assert ( (src_nbytes is None) or (dst_nbytes is None) or (src_nbytes == dst_nbytes) ) super().__init__( varname=varname, src=src, dst=dst, src_device_offset=src_device_offset )
[docs] class OpenClCopyDevice2DeviceLauncher(OpenClCopyBufferLauncher): """Reduced interface for device to device copy kernels.""" def __init__( self, varname, src, dst, src_device_offset=None, dst_device_offset=None, byte_count=None, ): src, src_device_offset, src_nbytes = self._format_device_arg( src, src_device_offset ) dst, dst_device_offset, dst_nbytes = self._format_device_arg( dst, dst_device_offset ) byte_count = first_not_None(byte_count, min(src_nbytes, dst_nbytes)) check_instance(src, (cl.MemoryObjectHolder,)) check_instance(dst, (cl.MemoryObjectHolder,)) check_instance(src_device_offset, (int, np.integer), allow_none=True) check_instance(dst_device_offset, (int, np.integer), allow_none=True) check_instance(byte_count, (int, np.integer), allow_none=True) assert ( (src_nbytes is None) or (dst_nbytes is None) or (src_nbytes == dst_nbytes) ) super().__init__( varname=varname, src=src, dst=dst, src_device_offset=src_device_offset, dst_device_offset=dst_device_offset, byte_count=byte_count, )
[docs] class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): """ Non-blocking OpenCL copy kernel between host buffers and/or opencl device rectangle subregions of buffers (OpenCL 1.1 and newer). Supports n-dimensional strided arrays with dimension greater than 3 via iterating over 3D subregions. """ def __init__( self, varname, src, dst, copy_region, copy_src_origin, copy_dst_origin, copy_src_pitches, copy_dst_pitches, iter_region=None, iter_src_origin=None, iter_dst_origin=None, iter_src_pitches=None, iter_dst_pitches=None, **kwds, ): """ Initialize a (HOST <-> DEVICE) or a (DEVICE <-> DEVICE) rectangle subregions copy kernel. Parameters ---------- varname: str Name of the variable copied for loggin purposes. src: cl.MemoryObjectHolder or np.ndarray The source buffer. dst: cl.MemoryObjectHolder or np.ndarray The destination buffer. copy_region: tuple of ints The 3D region to copy in terms of bytes for the first dimension and of elements for the two last dimensions. copy_src_origin: tuple of ints The 3D offset in number of elements of the region associated with src buffer. The final src offset in bytes is computed from src_origin and src_pitch. copy_dst_origin: tuple of ints The 3D offset in number of elements of the region associated with dst buffer. The final dst offset in bytes is computed from dst_origin and dst_pitch. copy_src_pitches: tuple of ints The 2D pitches used to compute src offsets in bytes for the second and the third dimension. copy_dst_pitches: tuple of ints The 2D pitches used to compute dst offsets in bytes for the second and the third dimension. iter_region: tuple of ints The n-dimensional region to iterate if the copied region dimension is greater than 3. iter_src_origin: tuple of ints The n-dimensional src array origin if the copied region dimension is greater than 3. iter_dst_origin: tuple of ints The n-dimensional dst array origin if the copied region dimension is greater than 3. iter_src_pitches: tuple of ints The n-dimensional src array pitches if the copied region dimension is greater than 3. iter_dst_pitches: tuple of ints The n-dimensional dst array pitches if the copied region dimension is greater than 3. kwds: dict Base class arguments """ iter_region = first_not_None(iter_region, ()) iter_src_origin = first_not_None(iter_src_origin, ()) iter_dst_origin = first_not_None(iter_dst_origin, ()) iter_src_pitches = first_not_None(iter_src_pitches, ()) iter_dst_pitches = first_not_None(iter_dst_pitches, ()) check_instance(src, (cl.MemoryObjectHolder, np.ndarray)) check_instance(dst, (cl.MemoryObjectHolder, np.ndarray)) check_instance(copy_region, tuple, values=(int, np.integer), size=3) check_instance(copy_src_origin, tuple, values=(int, np.integer), size=3) check_instance(copy_dst_origin, tuple, values=(int, np.integer), size=3) check_instance(copy_src_pitches, tuple, values=(int, np.integer), size=2) check_instance(copy_dst_pitches, tuple, values=(int, np.integer), size=2) n = len(iter_region) check_instance(iter_region, tuple, values=(int, np.integer), size=n) check_instance(iter_src_origin, tuple, values=(int, np.integer), size=n) check_instance(iter_dst_origin, tuple, values=(int, np.integer), size=n) check_instance(iter_src_pitches, tuple, values=(int, np.integer), size=n) check_instance(iter_dst_pitches, tuple, values=(int, np.integer), size=n) enqueue_copy_kwds = {} enqueue_copy_kwds["region"] = copy_region if isinstance(src, np.ndarray) and isinstance(dst, np.ndarray): msg = "Host to host copy is not supported." raise RuntimeError(msg) elif isinstance(src, cl.MemoryObjectHolder) and isinstance( dst, cl.MemoryObjectHolder ): enqueue_copy_kwds["src_origin"] = copy_src_origin enqueue_copy_kwds["src_pitches"] = copy_src_pitches enqueue_copy_kwds["dst_origin"] = copy_dst_origin enqueue_copy_kwds["dst_pitches"] = copy_dst_pitches src_origin_kwd = "src_origin" dst_origin_kwd = "dst_origin" elif isinstance(src, cl.MemoryObjectHolder) and isinstance(dst, np.ndarray): enqueue_copy_kwds["host_origin"] = copy_dst_origin enqueue_copy_kwds["host_pitches"] = copy_dst_pitches enqueue_copy_kwds["buffer_origin"] = copy_src_origin enqueue_copy_kwds["buffer_pitches"] = copy_src_pitches src_origin_kwd = "buffer_origin" dst_origin_kwd = "host_origin" elif isinstance(src, np.ndarray) and isinstance(dst, cl.MemoryObjectHolder): enqueue_copy_kwds["host_origin"] = copy_src_origin enqueue_copy_kwds["host_pitches"] = copy_src_pitches enqueue_copy_kwds["buffer_origin"] = copy_dst_origin enqueue_copy_kwds["buffer_pitches"] = copy_dst_pitches src_origin_kwd = "host_origin" dst_origin_kwd = "buffer_origin" else: msg = "The impossible happened.\n *src={}\n *dst={}" msg = msg.format(type(src), type(dst)) raise ValueError(msg) assert "name" not in kwds name = "enqueue_copy_rect_{}__{}_to_{}".format( varname, "host" if isinstance(src, np.ndarray) else "device", "host" if isinstance(dst, np.ndarray) else "device", ) apply_msg = "{}<<<{}>>>()" apply_msg = apply_msg.format(name, copy_region) # if iteration is required, we redefine __call__ if n > 0: apply_msg += f" iterated over ndindex {iter_region}" assert src_origin_kwd in enqueue_copy_kwds assert dst_origin_kwd in enqueue_copy_kwds src_origin = enqueue_copy_kwds.pop(src_origin_kwd) dst_origin = enqueue_copy_kwds.pop(dst_origin_kwd) super().__init__( dst=dst, src=src, enqueue_copy_kwds=enqueue_copy_kwds, name=name, apply_msg=apply_msg, **kwds, ) if n > 0: def call( queue=None, wait_for=None, iter_region=iter_region, iter_src_origin=iter_src_origin, iter_dst_origin=iter_dst_origin, iter_src_pitches=iter_src_pitches, iter_dst_pitches=iter_dst_pitches, **kwds, ): if __KERNEL_DEBUG__ or __TRACE_KERNELS__: print(" " + self._apply_msg) queue = first_not_None(queue, self._default_queue) check_instance(queue, cl.CommandQueue) for idx in npw.ndindex(*iter_region): src_byte_offset = npw.dot( npw.add(iter_src_origin, idx), iter_src_pitches ) dst_byte_offset = npw.dot( npw.add(iter_dst_origin, idx), iter_dst_pitches ) _src_origin = (src_origin[0] + src_byte_offset,) + src_origin[1:] _dst_origin = (dst_origin[0] + dst_byte_offset,) + dst_origin[1:] enqueue_copy_kwds[src_origin_kwd] = _src_origin enqueue_copy_kwds[dst_origin_kwd] = _dst_origin evt = cl.enqueue_copy( queue=queue, wait_for=wait_for, **enqueue_copy_kwds ) profile_kernel(None, evt, self._apply_msg, fprofiler=self._profiler) wait_for = None return evt self.call = call else: self.call = None def __call__(self, *args, **kwds): if self.call is None: return super().__call__(*args, **kwds) else: return self.call(*args, **kwds) @classmethod def _format_slices(cls, a, slices): check_instance(a, (np.ndarray, clArray.Array, Array)) shape = a.shape dtype = a.dtype ndim = a.ndim if (not slices) or (slices is Ellipsis): slices = (Ellipsis,) check_instance(slices, tuple) # expand ellipsis if Ellipsis in slices: nellipsis = slices.count(Ellipsis) msg = "Only one Ellipsis can be passed." assert nellipsis == 1, msg eid = slices.index(Ellipsis) missing_count = ndim - len(slices) missing_slices = tuple( slice(shape[i]) for i in range(eid, eid + missing_count + 1) ) full_slices = slices[:eid] + missing_slices + slices[eid + 1 :] slices = full_slices check_instance(slices, tuple, values=(int, slice), size=ndim) # compute indices indices = () for slc, si in zip(slices, shape): if (slc.stop is not None) and (slc.stop > si): msg = "Error in slice specification: slc={} but size is only {}." msg = msg.format(slc, si) raise ValueError(msg) if isinstance(slc, slice): indices += (slc.indices(si),) else: indices += ((slc, slc + 1, 1),) nelems = tuple((idx[1] - idx[0] + idx[2] - 1) // idx[2] for idx in indices) nbytes = prod(nelems) * dtype.itemsize return slices, dtype, nelems, nbytes, indices @classmethod def _compute_region(cls, a, indices): # compute nelems and parameters check_instance(indices, tuple, values=tuple, size=a.ndim) start_offset = 0 if isinstance(a, (np.ndarray,)): data = a elif isinstance(a, (HostArray,)): data = a.handle else: try: data = a.data except clArray.ArrayHasOffsetError: data = a.base_data start_offset = a.offset if isinstance(a, Array): a = a.handle shape = a.shape strides = a.strides dtype = a.dtype estart = tuple(idx[0] for idx in indices) estop = tuple(idx[1] for idx in indices) estep = tuple(idx[2] for idx in indices) assert len(shape) == len(strides) == len(estep) == len(estart) == len(estop) _estart, _estop, _estep = ( npw.asintegerarray(_) for _ in (estart, estop, estep) ) if ((_estart % _estep) != 0).any(): msg = "Start is not aligned on step, cannot compute origin." raise ValueError(msg) if ((_estop % _estep) != 0).any(): msg = "Stop is not aligned on step, cannot compute region." raise ValueError(msg) if estep[-1] != 1: msg = "Array is not contiguous (last slice step should be 1)." raise ValueError(msg) if strides[-1] != dtype.itemsize: msg = "Array is not contiguous (last strides should be item size)." raise ValueError(msg) region, origin, pitches = (), (), () for Si, Sr, start, stop, step in zip(shape, strides, estart, estop, estep): Ni = (stop - start + step - 1) // step if (Ni <= 0) or (Ni > Si): msg = f"Ni={Ni}, Si={Si}" raise ValueError(msg) elif (not region) or (Ni <= Si): region += (Ni,) origin += (start // step,) pitches += (step * Sr,) region = np.asarray(region, dtype=np.int32) origin = np.asarray(origin, dtype=np.int32) pitches = np.asarray(pitches, dtype=np.int32) assert pitches[-1] == dtype.itemsize pitches = pitches[:-1] region[-1] *= dtype.itemsize origin[-1] *= dtype.itemsize origin[-1] += start_offset return data, region, origin, pitches
[docs] @classmethod def from_slices(cls, varname, src, dst, src_slices=None, dst_slices=None): """ Build an OpenClCopyBufferRectLauncher from source, destinations and some slices. Device arrays must be aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN. """ assert hasattr(src, "shape") assert hasattr(src, "dtype") assert hasattr(src, "strides") assert hasattr(dst, "shape") assert hasattr(dst, "dtype") assert hasattr(dst, "strides") msg0 = "OpenClCopyBufferRectLauncher.from_slices()" msg0 += "\n *Inputs were:" msg0 += "\n src: shape={}, dtype={}, slices={}" msg0 += "\n dst: shape={}, dtype={}, slices={}" msg0 += "\n *Slices conversions were:" msg0 += "\n src_slices: {}" msg0 += "\n dst_slices: {}" msg0 = msg0.format( src.shape, src.dtype, src_slices, dst.shape, dst.dtype, dst_slices, "{}", "{}", ) src_slices, src_dtype, src_nelems, src_bytes, src_indices = cls._format_slices( src, src_slices ) dst_slices, dst_dtype, dst_nelems, dst_bytes, dst_indices = cls._format_slices( dst, dst_slices ) msg0 = msg0.format(src_slices, dst_slices) if src_bytes != dst_bytes: msg0 += ( "\n >Error: byte size mismatch between source and destination slices:" ) else: msg0 += "\n *Data types and byte count:" msg0 += "\n src: nelems={}, dtype={}, bytes={} ({}B)" msg0 += "\n dst: nelems={}, dtype={}, bytes={} ({}B)" msg0 = msg0.format( src_nelems, src_dtype, bytes2str(src_bytes), src_bytes, dst_nelems, dst_dtype, bytes2str(dst_bytes), dst_bytes, ) if src_bytes != dst_bytes: raise ValueError(msg0) src_data, src_region, src_origin, src_pitches = cls._compute_region( src, src_indices ) dst_data, dst_region, dst_origin, dst_pitches = cls._compute_region( dst, dst_indices ) if (src_region != dst_region).any(): msg0 += "\n >Error: mismatch between source and destination regions:" else: msg0 += "\n *Determined regions:" msg0 += "\n src: region={}, origin={}, pitches={}" msg0 += "\n dst: region={}, origin={}, pitches={}" msg0 = msg0.format( src_region, src_origin, src_pitches, dst_region, dst_origin, dst_pitches ) if (src_region != dst_region).any(): raise ValueError(msg0) region = src_region if (region <= 0).any(): msg = "\n >Error: region is ill-formed or zero-sized:" msg += "\n region: {}" msg = msg.format(region) raise ValueError(msg0 + msg) total_dims = src_region.size copy_dims = min(total_dims, 3) iter_dims = total_dims - copy_dims assert copy_dims > 0 assert iter_dims >= 0 zero, one = np.int32(0), np.int32(1) copy_region = [one] * 3 copy_src_origin, copy_dst_origin = [zero] * 3, [zero] * 3 copy_src_pitches, copy_dst_pitches = [zero] * 2, [zero] * 2 copy_region[:copy_dims] = region[::-1][:copy_dims] copy_src_origin[:copy_dims] = src_origin[::-1][:copy_dims] copy_dst_origin[:copy_dims] = dst_origin[::-1][:copy_dims] copy_src_pitches[: copy_dims - 1] = src_pitches[::-1][: copy_dims - 1] copy_dst_pitches[: copy_dims - 1] = dst_pitches[::-1][: copy_dims - 1] copy_region = tuple(copy_region) copy_src_origin = tuple(copy_src_origin) copy_dst_origin = tuple(copy_dst_origin) copy_src_pitches = tuple(copy_src_pitches) copy_dst_pitches = tuple(copy_dst_pitches) iter_region = tuple(region[:iter_dims]) iter_src_origin = tuple(src_origin[:iter_dims]) iter_dst_origin = tuple(dst_origin[:iter_dims]) iter_src_pitches = tuple(src_pitches[:iter_dims]) iter_dst_pitches = tuple(dst_pitches[:iter_dims]) msg0 += "\n *Dimensions:" msg0 += "\n total: {}" msg0 += "\n copy: {}" msg0 += "\n iter: {}" msg0 = msg0.format(total_dims, copy_dims, iter_dims) msg0 += "\n *enqueue_copy kernel arguments:" msg0 += "\n region: {}" msg0 += "\n src: origin={}, pitches={}" msg0 += "\n dst: origin={}, pitches={}" msg0 = msg0.format( copy_region, copy_src_origin, copy_src_pitches, copy_dst_origin, copy_dst_pitches, ) msg0 += "\n *iter arguments:" msg0 += "\n region: {}" msg0 += "\n src: origin={}, pitches={}" msg0 += "\n dst: origin={}, pitches={}" msg0 = msg0.format( iter_region, iter_src_origin, iter_src_pitches, iter_dst_origin, iter_dst_pitches, ) return cls( varname=varname, src=src_data, dst=dst_data, copy_region=copy_region, copy_src_origin=copy_src_origin, copy_dst_origin=copy_dst_origin, copy_src_pitches=copy_src_pitches, copy_dst_pitches=copy_dst_pitches, iter_region=iter_region, iter_src_origin=iter_src_origin, iter_dst_origin=iter_dst_origin, iter_src_pitches=iter_src_pitches, iter_dst_pitches=iter_dst_pitches, )
[docs] class OpenClFillKernelLauncher(OpenClCopyBufferRectLauncher): """Cache buffers to perform a fill operation by using an OpenClCopyBufferRectLauncher.""" __fill_buffers = {}
[docs] @classmethod def from_slices(cls, varname, backend, fill_value, dst): if isinstance(dst, OpenClArray): assert backend == dst.backend else: assert isinstance(dst, clArray.Array) shape = dst.shape dtype = dst.dtype fill_value = dst.dtype.type(fill_value) src = cls.create_fill_buffer(backend, dtype, shape, fill_value) obj = super().from_slices(varname=varname, src=src, dst=dst) return obj
[docs] @classmethod def create_fill_buffer(cls, backend, dtype, shape, fill_value): assert backend.kind == Backend.OPENCL from hysop.tools.misc import prod size = prod(shape) key = (backend, dtype, size, fill_value) if key in cls.__fill_buffers: buf = cls.__fill_buffers[key] else: buf = backend.full(dtype=dtype, shape=shape, fill_value=fill_value) cls.__fill_buffers[key] = buf return buf.reshape(shape)